library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt) ; library(limma)
library(knitr)

Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
datMeta = datMeta %>% mutate(ID = paste0('X',description))

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('ID'=datGenes$ensembl_gene_id) %>% left_join(GO_neuronal, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  dplyr::rename(log2FoldChange = 'logFC', padj = 'adj.P.Val') %>%
  mutate(significant=padj<0.05 & !is.na(padj))


rm(GO_annotations)


Mean Level of Expression


All samples together


  • The distributions seem pretty homogeneous
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))

p1 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') + ggtitle('Mean Expression distribution by Gene') +
     theme_minimal()

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))

p2 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') +
     theme_minimal() + ggtitle('Mean expression distribution by Sample')

grid.arrange(p1, p2, nrow=1)

rm(p1, p2, plot_data)

Grouping samples by Phenotype


The differences in level of expression between Brain Region are not statistically significant

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            left_join(datMeta, by='ID')

plot_data %>% ggplot(aes(Brain_lobe, Mean, fill = Brain_lobe)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) + 
     xlab('Brain Lobe') + ylab('') + theme_minimal() + theme(legend.position = 'none')

Grouping genes by Neuronal tag and samples by Diagnosis


  • The two groups of genes seem to be partially characterised by genes with Neuronal function

  • In general, the ASD group has a higher dispersion than the control group

  • Only the difference by neuronal function is statistically significant

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>% 
            left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))

p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
                   scale_x_log10() + theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by gene')

p3 = plot_data %>% ggplot(aes(Neuronal, Mean, fill = Neuronal)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() + 
                   ylab('Mean Expression') + theme(legend.position = 'none')

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% left_join(datMeta, by='ID')

p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by Sample')

p4 = plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() +
                   ylab('Mean Expression') + theme(legend.position = 'none')


grid.arrange(p1, p2, p3, p4, nrow=2)

rm(GO_annotations, plot_data, p1, p2, p3, p4)


Grouping genes and samples by Diagnosis

In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene

plot_data = data.frame('ID'=rownames(datExpr),
                       'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
                       'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
                       mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
                       mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))

plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) + 
              geom_abline(color = 'gray') + scale_color_viridis(direction = -1) + 
              ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()

summary(plot_data$std_diff)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -12.764425  -0.493884   0.006659   0.000000   0.490468   9.009259
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))

There are 225 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes. Gene ENSG00000129824 has the largest difference in mean expression between ASD and CTL, but it has a low level of expression, so it probably won’t be statistically significant (we confirm this in the Differential Expression Section)


  • There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups

  • Samples with autism tend to have higher values than the control group (as we had already seen above)

plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())

plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal() +
              ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)




Visualisations


Samples


PCA


The first principal component seems to separate relatively well the two Diagnosis

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by='ID') %>% dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              theme_minimal() + ggtitle('PCA of Samples')

rm(pca, plot_data)


MDS


Looks exactly the same as the PCA visualisation, just inverting the both axes

d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)

plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>% left_join(datMeta, by='ID') %>% 
            dplyr::select('C1','C2','Diagnosis') %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point(alpha = 0.8) + theme_minimal() + ggtitle('MDS')

rm(d, fit, plot_data)


t-SNE


t-SNE manages to group samples by diagnosis with more than one perplexity

perplexities = c(1,3,5,8)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>% 
              left_join(datMeta, by='ID') %>%
              dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
              mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
            ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, perplexities, tsne, i)


In the Gandal dataset, the higher perplexity values managed to capture the subject the samples belonged to, it seems to do the same here, perhaps even clearer

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() + 
         theme(legend.position='none') + ggtitle('t-SNE Perplexity=30 coloured by Subject ID'))
rm(plot_data)

Genes


PCA


  • The First Principal Component explains over 99% of the total variance

  • There’s a really strong correlation between the mean expression of a gene and the 1st principal component

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


t-SNE


Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA

perplexities = c(1,2,5,10,50,100)
ps = list()

for(i in 1:length(perplexities)){
  tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
            scale_color_viridis() + ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(perplexities, ps, tsne, i)




Differential Expression Analysis


table(DE_info$padj<0.05, useNA='ifany')
## 
## FALSE  TRUE 
## 13066  2449
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) + 
    scale_y_sqrt() + xlab('Log Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)

rm(p)
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID') %>%
            mutate('statisticallySignificant' = ifelse(is.na(padj),NA, ifelse(padj<0.05, TRUE, FALSE)),
                   'alpha' = ifelse(padj>0.05 | is.na(padj), 0.1, 0.5))

plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color=statisticallySignificant)) + 
              geom_point(alpha = plot_data$alpha) + geom_smooth(method='lm') + 
              theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
              xlab('Mean Expression') + ylab('LFC Magnitude') + 
              ggtitle('Log fold change by level of expression')

datExpr_DE = datExpr[DE_info$significant,]

pca = datExpr_DE %>% prcomp

plot_data = cbind(data.frame('PC1'=pca$x[,1], 'PC2'=pca$x[,2]), DE_info[DE_info$significant==TRUE,])

pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
                  scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                                        values=c(0, pos_zero-0.05, pos_zero, pos_zero+0.05, 1)) +
                  theme_minimal() + ggtitle('
PCA of differentially expressed genes') + # This is on purpose, PDF doesn't save well without this white space (?)
                  xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
                  ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + 
                  theme(legend.position = 'bottom')
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)

rm(pos_zero, p)

Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed

plot_data = plot_data %>% mutate('Group'=ifelse(log2FoldChange>0,'overexpressed','underexpressed')) %>%
            mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))

List of top DE genes

# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>% 
             rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
## Batch submitting query [==================>------------] 60% eta: 0s Batch
## submitting query [========================>------] 80% eta: 0s Batch submitting
## query [===============================] 100% eta: 0s
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% arrange(-abs(log2FoldChange)) %>% 
            top_n(50, wt=log2FoldChange)

kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal, Group))
ID gene_name log2FoldChange padj Neuronal Group
ENSG00000173915 USMG5 0.7612407 0.0014621 0 overexpressed
ENSG00000182103 FAM181B 0.6627976 0.0000757 0 overexpressed
ENSG00000196136 SERPINA3 0.6095301 0.0430646 0 overexpressed
ENSG00000073756 PTGS2 0.5214530 0.0090993 1 overexpressed
ENSG00000106211 HSPB1 0.5163140 0.0001058 0 overexpressed
ENSG00000101327 PDYN 0.5029224 0.0004849 0 overexpressed
ENSG00000158373 HIST1H2BD 0.4482958 0.0000048 0 overexpressed
ENSG00000175206 NPPA 0.4440241 0.0005972 0 overexpressed
ENSG00000124762 CDKN1A 0.4355518 0.0070656 0 overexpressed
ENSG00000108691 CCL2 0.4305914 0.0163803 1 overexpressed
ENSG00000204219 TCEA3 0.4305223 0.0000474 0 overexpressed
ENSG00000163520 FBLN2 0.4182056 0.0000399 0 overexpressed
ENSG00000114200 BCHE 0.4179747 0.0004748 0 overexpressed
ENSG00000132002 DNAJB1 0.4176425 0.0164132 0 overexpressed
ENSG00000012963 UBR7 0.4127440 0.0000158 0 overexpressed
ENSG00000173486 FKBP2 0.4124107 0.0006110 0 overexpressed
ENSG00000105697 HAMP 0.4089664 0.0489586 0 overexpressed
ENSG00000180573 HIST1H2AC 0.3883532 0.0000066 0 overexpressed
ENSG00000136235 GPNMB 0.3875091 0.0167178 0 overexpressed
ENSG00000101298 SNPH 0.3873132 0.0001051 1 overexpressed
ENSG00000159189 C1QC 0.3855800 0.0007581 0 overexpressed
ENSG00000221869 CEBPD 0.3805648 0.0010027 0 overexpressed
ENSG00000134531 EMP1 0.3798648 0.0191595 0 overexpressed
ENSG00000184678 HIST2H2BE 0.3754434 0.0000087 0 overexpressed
ENSG00000134853 PDGFRA 0.3710305 0.0031163 0 overexpressed
ENSG00000026559 KCNG1 0.3661942 0.0358647 0 overexpressed
ENSG00000204347 BTBD17 0.3633140 0.0081243 0 overexpressed
ENSG00000197019 SERTAD1 0.3611023 0.0017659 0 overexpressed
ENSG00000103260 METRN 0.3610544 0.0004111 0 overexpressed
ENSG00000177707 PVRL3 0.3541528 0.0004741 0 overexpressed
ENSG00000197903 HIST1H2BK 0.3541393 0.0000004 0 overexpressed
ENSG00000149257 SERPINH1 0.3534902 0.0206716 0 overexpressed
ENSG00000113742 CPEB4 0.3426404 0.0065112 1 overexpressed
ENSG00000164253 WDR41 0.3389893 0.0052486 0 overexpressed
ENSG00000126767 ELK1 0.3337928 0.0001943 0 overexpressed
ENSG00000189306 RRP7A 0.3335859 0.0002759 0 overexpressed
ENSG00000162623 TYW3 0.3311540 0.0003521 0 overexpressed
ENSG00000069667 RORA 0.3291238 0.0024389 0 overexpressed
ENSG00000160049 DFFA 0.3285121 0.0000012 0 overexpressed
ENSG00000134086 VHL 0.3280740 0.0001907 0 overexpressed
ENSG00000170458 CD14 0.3219690 0.0288669 0 overexpressed
ENSG00000134769 DTNA 0.3157405 0.0096716 0 overexpressed
ENSG00000150551 LYPD1 0.3156217 0.0033662 0 overexpressed
ENSG00000130208 APOC1 0.3123769 0.0430646 0 overexpressed
ENSG00000137193 PIM1 0.3120680 0.0001198 0 overexpressed
ENSG00000154478 GPR26 0.3117004 0.0027446 0 overexpressed
ENSG00000177606 JUN 0.3112235 0.0002654 1 overexpressed
ENSG00000074410 CA12 0.3099400 0.0008752 0 overexpressed
ENSG00000144566 RAB5A 0.3085956 0.0002759 0 overexpressed
ENSG00000106624 AEBP1 0.3078414 0.0007859 0 overexpressed
rm(top_genes)

Plotting the mean expression by group in Gandal’s dataset there seemed to exist underlying distributions, so we would use GMM to separate them, but everything seems very homogeneous here, so this doesn’t seem to be necessary. (If we do it anyway we can see that they still cluster by mean expression, which makes sense since it explains the majority of the variance of the genes)

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

tot_n_clusters = 4

plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))

GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)

memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
                            'Membership'=GMM_G1$Log_likelihood %>% 
                            apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
                            'Membership'=GMM_G2$Log_likelihood %>% 
                             apply(1, function(x) glue('under_', which.max(x))))

plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')

p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + 
     theme_minimal() + theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(x=Group, y=MeanExpr, fill=Group)) + ylab('Mean Expression') + xlab('') +
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     theme_minimal() + theme(legend.position='none')

p3 = plot_data %>% ggplot(aes(x=MeanExpr)) + ylab('Density') +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
                   args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
                   args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
                   args=list(mean=GMM_G1$centroids[3], sd=GMM_G1$covariance_matrices[3])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
                   args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
     theme_minimal()

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() + 
     theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2, p3, tot_n_clusters)

For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any strong patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.

plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + 
              theme_minimal() + theme(legend.position = 'bottom')

rm(plot_data)



Effects of modifying the log fold change threshold


fc_list = seq(1, 1.4, 0.01)

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)

# Initialise DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(fc in fc_list){
  
  # Recalculate DE_info with the new threshold (p-values change) an filter DE genes
  DE_genes = topTable(efit, coef=2, number=Inf, sort.by='none', lfc = log2(fc)) %>% data.frame %>%
             left_join(data.frame('ID' = rownames(datExpr), 'AveExpr' = rowMeans(datExpr)), by = 'AveExpr') %>%
             mutate(padj = adj.P.Val) %>% filter(padj<0.05)
  
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2, 
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% left_join(datMeta, by='ID') %>% 
             dplyr::select(ID, PC1, PC2, fc, Diagnosis, Brain_lobe)

# Plot change of number of genes
ggplotly(data.frame('lfc'=log2(fc_list), 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Threshold') + ylab('DE Genes') +
         ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(fc_list, n_genes, fc, pca_samps_new, pca_genes_new, pca_samps_old, pca_genes_old, 
   datExpr_pca_samps, datExpr_pca_genes)


Samples

Note: PC values get smaller as LFC increases, so on each iteration the values were scaled so it would be easier to compare between frames

Coloured by Diagnosis:

  • LFC = -1 represents the whole set of genes, without any filtering by differential expression

  • The LFC threshold doesn’t seem to make a big difference for differentiating genes by Diagnosis

ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Coloured by brain region:


There doesn’t seem to be a strong recognisable pattern

ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha = 0.7) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Genes

if(!'fcSign' %in% colnames(pcas_genes)){
  pcas_genes = pcas_genes %>% left_join(DE_info, by='ID') %>% 
               mutate(LFCSign = ifelse(log2FoldChange>0,'Positive','Negative')) 
}

ggplotly(pcas_genes %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=LFCSign)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.2) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying LFC Magnitude filtering threshold'))




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.28                  limma_3.40.6               
##  [3] biomaRt_2.40.5              DESeq2_1.24.0              
##  [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [7] BiocParallel_1.18.1         matrixStats_0.56.0         
##  [9] Biobase_2.44.0              GenomicRanges_1.36.1       
## [11] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [13] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [15] ClusterR_1.2.1              gtools_3.8.2               
## [17] expss_0.10.2                Rtsne_0.15                 
## [19] ggpubr_0.2.5                magrittr_1.5               
## [21] GGally_1.5.0                gridExtra_2.3              
## [23] viridis_0.5.1               viridisLite_0.3.0          
## [25] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [27] plotly_4.9.2                glue_1.4.1                 
## [29] reshape2_1.4.4              forcats_0.5.0              
## [31] stringr_1.4.0               dplyr_1.0.0                
## [33] purrr_0.3.4                 readr_1.3.1                
## [35] tidyr_1.1.0                 tibble_3.0.1               
## [37] ggplot2_3.3.2               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] gmp_0.5-13.6           crosstalk_1.1.0.1      digest_0.6.25         
##  [10] htmltools_0.4.0        fansi_0.4.1            checkmate_2.0.0       
##  [13] memoise_1.1.0          cluster_2.1.0          annotate_1.62.0       
##  [16] modelr_0.1.6           prettyunits_1.1.1      jpeg_0.1-8.1          
##  [19] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [22] haven_2.2.0            xfun_0.12              crayon_1.3.4          
##  [25] RCurl_1.98-1.2         jsonlite_1.7.0         genefilter_1.66.0     
##  [28] survival_3.1-12        gtable_0.3.0           zlibbioc_1.30.0       
##  [31] XVector_0.24.0         scales_1.1.1           DBI_1.1.0             
##  [34] miniUI_0.1.1.1         Rcpp_1.0.4.6           xtable_1.8-4          
##  [37] progress_1.2.2         htmlTable_1.13.3       foreign_0.8-76        
##  [40] bit_1.1-15.2           Formula_1.2-3          htmlwidgets_1.5.1     
##  [43] httr_1.4.1             acepack_1.4.1          ellipsis_0.3.1        
##  [46] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [49] farver_2.0.3           nnet_7.3-14            dbplyr_1.4.2          
##  [52] locfit_1.5-9.4         later_1.0.0            tidyselect_1.1.0      
##  [55] labeling_0.3           rlang_0.4.6            AnnotationDbi_1.46.1  
##  [58] munsell_0.5.0          cellranger_1.1.0       tools_3.6.3           
##  [61] cli_2.0.2              generics_0.0.2         RSQLite_2.2.0         
##  [64] broom_0.5.5            fastmap_1.0.1          evaluate_0.14         
##  [67] yaml_2.2.1             bit64_0.9-7            fs_1.4.0              
##  [70] nlme_3.1-147           mime_0.9               ggExtra_0.9           
##  [73] xml2_1.2.5             compiler_3.6.3         rstudioapi_0.11       
##  [76] curl_4.3               png_0.1-7              ggsignif_0.6.0        
##  [79] reprex_0.3.0           geneplotter_1.62.0     stringi_1.4.6         
##  [82] highr_0.8              lattice_0.20-41        Matrix_1.2-18         
##  [85] vctrs_0.3.1            pillar_1.4.4           lifecycle_0.2.0       
##  [88] data.table_1.12.8      bitops_1.0-6           httpuv_1.5.2          
##  [91] R6_2.4.1               latticeExtra_0.6-29    promises_1.1.0        
##  [94] assertthat_0.2.1       withr_2.2.0            GenomeInfoDbData_1.2.1
##  [97] mgcv_1.8-31            hms_0.5.3              grid_3.6.3            
## [100] rpart_4.1-15           rmarkdown_2.1          shiny_1.4.0.2         
## [103] lubridate_1.7.4        base64enc_0.1-3